dcm_prev <- 0.004 # DCM prevalence
gene_pos_pct <- 0.35 # proportion of DCM cases gene positive
gene_vus_pct <- 0.15 # proportion of DCM cases variant of unknown significance
gene_neg_pct <- 0.50 # proportion of DCM cases gene negative
n_individuals <- 500000
Here we simulate a dataset of 500,000 individuals using a DCM prevalence of 0.40% and percentage of gene positive, gene negative, and variant of unknown significance (VUS) of 35%, 50%, and 15%, respectively. We adjust the means of the polygenic risk score such that the gene positive cohort has the maximum, followed by VUS, then gene negative. Non-DCM individuals have a mean PGS of 0.
# set prevalences and PGS means
gene_statuses <- list(
`No DCM` = list(prop = 1-dcm_prev, mean_pgs = 0.00),
`DCM gene +ve` = list(prop = dcm_prev*gene_pos_pct, mean_pgs = 1.00),
`DCM gene vus` = list(prop = dcm_prev*gene_vus_pct, mean_pgs = 0.85),
`DCM gene -ve` = list(prop = dcm_prev*gene_neg_pct, mean_pgs = 0.60)
)
# reproducibility
set.seed(123)
# simulate te dataset
dat <- data.table(
id = sprintf(paste0("ID_%0", ceiling(log10(n_individuals+1)), "d"), 1:n_individuals),
sex = factor(sample(c("male", "female"), n_individuals, replace = TRUE), levels = c("male", "female")),
gene_status = factor(sample(x = names(gene_statuses),
size = n_individuals,
replace = TRUE,
prob = sapply(gene_statuses, function(x) x$prop)),
levels = names(gene_statuses))
)
dat[, dcm_status := factor(gene_status=="No DCM", levels=c(T,F), labels=c("No DCM", "DCM"))]
dat[, pgs := rnorm(.N, mean = gene_statuses[[as.character(gene_status)]]$mean_pgs, sd = 1), by = .(gene_status)]
dat[, age := ifelse(dcm_status == "DCM",
pmin(pmax(rnorm(.N, mean = 55, sd = 10), 18), 95),
pmin(pmax(rnorm(.N, mean = 45, sd = 15), 18), 95))]
dat[, age_cat := cut(age,
breaks = seq(0, 100, by = 20),
labels = c("0-19", "20-39", "40-59", "60-79", "80+"),
right = FALSE)]
# view
knitr::kable(unique(dat, by="gene_status"))
| id | sex | gene_status | dcm_status | pgs | age | age_cat |
|---|---|---|---|---|---|---|
| ID_000001 | male | No DCM | No DCM | 0.2620001 | 66.80026 | 60-79 |
| ID_000498 | male | DCM gene +ve | DCM | 2.5414200 | 60.93302 | 60-79 |
| ID_000741 | male | DCM gene -ve | DCM | 2.4990527 | 60.42208 | 60-79 |
| ID_005589 | female | DCM gene vus | DCM | 0.5613612 | 62.78699 | 60-79 |
We then calculate the odds ratio for DCM by PGS bin. We define appropriate cut-offs for the PGS for “pathogenic” and “intermediate” criteria.
| or_thresh | thresh_lab |
|---|---|
| 1.5 | intermediate |
| 5.0 | pathogenic |
We calculate the odds ratio by PGS bin and fit a spline model to help extract the quantile at which the thresholds are reached.
# calculate the odds ratio of DCM by PGS quantile
n_quantiles <- 10
quantiles <- c(0, 1, 5, 10, 20, 40, 60, 80, 90, 95, 99, 100)
dat[, pgs_quantile := cut(pgs,
breaks = quantile(pgs, probs = quantiles/100, na.rm = TRUE),
labels = c(paste0("<",quantiles[2]),
paste0(quantiles[2:(length(quantiles)-2)], "-", quantiles[3:(length(quantiles)-1)]),
paste0(">", quantiles[length(quantiles)-1])),
include.lowest = TRUE)]
dat[, pgs_percentile := cut(pgs,
breaks = quantile(pgs, probs = seq(0,1,0.01), na.rm = TRUE),
labels = seq(1,100,1),
include.lowest = TRUE)]
or_table <- data.table(pgs_quantile = unique(dat$pgs_quantile))
or_table[, `:=`(
dcm_cases_in_group = dat[pgs_quantile == .BY[[1]] & dcm_status != "No DCM", .N],
dcm_controls_in_group = dat[pgs_quantile == .BY[[1]] & dcm_status == "No DCM", .N],
dcm_cases_rest = dat[pgs_quantile != .BY[[1]] & dcm_status != "No DCM", .N],
dcm_controls_rest = dat[pgs_quantile != .BY[[1]] & dcm_status == "No DCM", .N]
), by = pgs_quantile]
or_table[, `:=`(
odds_in_group = dcm_cases_in_group / dcm_controls_in_group,
odds_in_rest = dcm_cases_rest / dcm_controls_rest
)]
or_table[, `:=`(odds_ratio = odds_in_group / odds_in_rest,
se_log_or = sqrt(1 / dcm_cases_in_group +
1 / dcm_controls_in_group +
1 / dcm_cases_rest +
1 / dcm_controls_rest))]
or_table[, `:=`(log_or = log(odds_ratio))]
or_table[, `:=`(ci_lower = exp(log_or - 1.96 * se_log_or),
ci_upper = exp(log_or + 1.96 * se_log_or))]
# model the odds ratio across PGS quantile and extract the thresholds of 5 and 1.5
or_table[, pgs_midpoint := as.numeric(sub("-.*", "", as.character(pgs_quantile))) + (as.numeric(sub(".*-", "", as.character(pgs_quantile))) - as.numeric(sub("-.*", "", as.character(pgs_quantile)))) / 2]
or_table[as.character(pgs_quantile) == "<1", pgs_midpoint := 0.5]
or_table[as.character(pgs_quantile) == ">99", pgs_midpoint := 99.5]
spline_model <- lm(log(odds_ratio) ~ ns(pgs_midpoint, df = 6), data = or_table)
pred_pgs_values <- seq(0,100,0.01)
predicted_or <- data.table(pgs_quantile_val = pred_pgs_values,
pred_or = exp(predict(spline_model, newdata = data.frame(pgs_midpoint = pred_pgs_values))))
# predict the thresholds from the fit
pgs_thresholds[, pgs_cutoff := sapply(or_thresh, function(x) predicted_or[which.min(abs(pred_or-x)), pgs_quantile_val])]
pgs_thresholds[, pgs := sapply(pgs_cutoff, function(x) dat[ceiling(x)==as.numeric(pgs_percentile), mean(pgs, na.rm=T)])]
dat <- dat[
pgs_thresholds,
on = .(pgs > pgs),
`:=`(thresh_lab = paste0(i.thresh_lab, " PGS"))
]
dat[is.na(thresh_lab), `:=`(thresh_lab = "low PGS")]
# view
knitr::kable(unique(dat, by="thresh_lab"))
| id | sex | gene_status | dcm_status | pgs | age | age_cat | pgs_quantile | pgs_percentile | thresh_lab |
|---|---|---|---|---|---|---|---|---|---|
| ID_000001 | male | No DCM | No DCM | 0.2620001 | 66.80026 | 60-79 | 60-80 | 61 | low PGS |
| ID_000007 | female | No DCM | No DCM | 1.0415965 | 18.00000 | 0-19 | 80-90 | 86 | intermediate PGS |
| ID_000210 | female | No DCM | No DCM | 2.2827584 | 53.80471 | 40-59 | 95-99 | 99 | pathogenic PGS |
We then annotate the DCM cases in the dataset as to whether they have a pathogenic, intermediate or low PGS.
We derive the odds-ratio per standard deviation chnage in PGS for DCM use this to calculate the likelihood ratios, and population attributable risk, by PGS bin.
# Likelihood ratios and attributable risk faction
# calculate the odds ratio per Sdev of the PGS
model <- glm(dcm_status ~ pgs, data = dat, family = "binomial")
summary_model <- summary(model)
odds_ratio_sd <- exp(coef(summary_model)["pgs", "Estimate"] * sd(dat$pgs))
odds_ratio_sd_lci <- exp((coef(summary_model)["pgs", "Estimate"] - 1.96*coef(summary_model)["pgs", "Std. Error"]) * sd(dat$pgs))
odds_ratio_sd_uci <- exp((coef(summary_model)["pgs", "Estimate"] + 1.96*coef(summary_model)["pgs", "Std. Error"]) * sd(dat$pgs))
attributable_risk_frac <- function(prev, or_per_sd, centile) {
odds <- prev / (1-prev)
u.A <- log(or_per_sd)
u.U <- 0
u.diff <- u.A - u.U
z.U <- qnorm(centile)
z.A <- z.U - u.A
pgs_centile_aff <- pnorm(z.A)
L.U <- dnorm(z.U)
L.A <- dnorm(z.A)
LR <- L.A / L.U
posttest.odds <- odds * LR
posttest.P <- posttest.odds / (posttest.odds + 1)
afe <- 1 - (prev / posttest.P)
list(LR = LR, afe = afe)
}
likelihood_ratio <- data.table(
dcm.P = dcm_prev,
pgs_or_per_sd = odds_ratio_sd,
pgs_or_per_sd_lci = odds_ratio_sd_lci,
pgs_or_per_sd_uci = odds_ratio_sd_uci,
pgs_centile_unaff = seq(0.01,0.99,0.01)
)
likelihood_ratio[, c("LR", "afe") := attributable_risk_frac(dcm.P, pgs_or_per_sd, pgs_centile_unaff)]
likelihood_ratio[, c("LR_lci", "afe_lci") := attributable_risk_frac(dcm.P, pgs_or_per_sd_lci, pgs_centile_unaff)]
likelihood_ratio[, c("LR_uci", "afe_uci") := attributable_risk_frac(dcm.P, pgs_or_per_sd_uci, pgs_centile_unaff)]
Dilated cardiomyopathy (DCM) is a primary cardiac muscle disorder characterised by abnormal structure and function of the heart with a prevalence of 0.4%. In terms of aetiology, it is an apparently heterogeneous condition with monogenic genetic causes to environmental exposures. DCM is an important diagnosis, carrying high morbidity and mortality rates from complications such as heart failure and arrhythmia. Current genetic testing identifies a culprit gene in approximately 15-25% of sporadic cases and 20-40% of patients with familial DCM. However, the heritability of DCM is XX estimated from common variants suggesting that there exists an underlying propensity to the development of DCM, how this interacts with different exposures over the life course is not fully understood.
In the clinic we are often faced with so called gene-elusive DCM, patients who display the phenotype but lack a known pathogenic variant. Polygenic risk scores (PGS) can be used to determine the overall burden of DCM associated variants, identified from large genome-wide association studies. PGSs represents the weighted sum on independent genetic variants associated with a disease, in any given individual. There is great interest in how PGSs might be incorporated into clinical use and population screening.1,2
PGSs are often interpreted as the amount of risk increase in disease per standard deviation increase in polygenic risk. Conversion of PGSs to clinically meaningful performance metrics (odds of being affected, or positive predictive value) can be derived from the overlap in polygenic risk score distributions between affected and unaffected groups. It is mathematically simple and has been described in detail elsewhere.3,4 Additionally, with knowledge of the baseline prevalence of DCM in the general population this can be leveraged to estimate the population attributable risk for an individual.
Currently, it is unclear how the family of gene-elusive patients should be screened and followed up. Current guidelines suggest … however the penetrance of cases is low and better stratification is needed in order to utilised resources most effectively.
Here, we show how a polygenic risk score for DCM, with appropriate thresholds, can be used in the diagnosis of ‘polygenic DCM’ and the population attributable risk for any particular genome in order to better communicate to patients the likely aetiology of their underlying condition.
Sequencing data for 2,XXX individuals of European ancestry referred for dilated cardiomyopathy were collection from XX centers: X, Y, X. XX European ancestry controls were identified from the UK Biobank.
Rare variant status was determined by … and individuals codes as: gene positive (pathogenic/like-pathogenic), variant of unknown significance, and gene negative.
All research participants provided written informed consent, and the studies were reviewed and approved by the relevant research ethics committee (X, Y, Z).
Base data for DCM association statistics were obtained from the latest HERMES consortium GWAS.5
Genotyping as performed using the XX Array. Pre-imputation quality control included X, Y, Z. Imputation was conducted using the Michigan Imputation Server with TOPMed Freeze5 reference panel[???].
Common genetic variant effect sizes obtained from previously published GWAS summary statistics were used as weightings in the generation of the PGS for DCM. DCM GWAS summary statistics were filtered to exclude rare variants (MAF < 1%) and ambiguous variants not resolvabnle by strand flipping.
The XXX software was used for generation of the PGS with parameters X, Y, Z. PGS value for each individual was calculated in the standard way, using genotype dosage for each allele, multiplied by its weight, summed across all variants.6
The distribution of DCM patients by DCM-gene status is presented in Figure 1. XX% of patients carried and pathogenic or likely-pathogenic genetic variant, XX% of patients carried a variant of unknown significance, and XX% of patients were classified as gene elusive DCM.
The distributions of PGS by DCM-gene status and unaffected individuals is presented in Figure 2. The mean PGS was greatest in the gene-positive DCM cohort (XX), followed with the cohort with a variant of unknown significance (XX), and gene negative DCM (XX). Unaffected individuals had the lowest mean PGS (XX).
The PGS at a ‘pathogenic’ odds-ratio threshold of >=5 for DCM was XX and for the ‘intermediate’ odds-ratio threshold of 1.5 was XX (Figure 3). Categorising gene-elusive DCM cases using these thresholds showed that XX% of gene-elusive patients had a pathogenic DCM PGS and XX% had an intermediate PGS (Figure 4).
The population attributable risk (PAR) per PGS centile is presented in Figure 5. Pathogenic and intermediate PGS thresholds had an explained PAR of XX% and XX% respectively.
The survival curves by polygenic risk stratification are shown in Figure X.
Pathogenic genetic variants explain roughly XX% of cases of dilate cardiomyopathy, leaving a remaining XX% so called gene-elusive cases. The polygenic risk in these remaining cases is substantial and likely explanatory in a significant number of patients.
…
Whilst PGSs can explain significant amounts of variability in traits, concerns have been raised on the variability of different scores, equally performant on the population level, on the individual level, as well as their use in population screening, owing to substantial overlap in PGS distributions amongst the affected and unaffected.4,7 It has been estimated that an 80% detection rate for a 5% false positive rate requires an odds ratio for one standard deviation increase in PGS of 12.4
Notwithstanding the limitations of PGSs in defining individual patient risk and population screening, we demonstrate here the utility of PGS in quantifying the underlying polygenic risk in patients who are manifestly DCM patients but in whom a likely single causative genetic variant has not been found. We propose that this is clinically useful information, informing patient discussions regarding their disease and potential implications for family members.
Strategies to effectively communicate risk estimates derived from PGSs to clinicians and patients are required, especially when communicating results to unaffected individuals where the psychosocial impact could be substantial if not managed appropriately.
PGSs may help explain the variable penetrance of DCM in family members with identified point mutations but without the clinical phenotype. Polygenic hazard scores could be used to determine the likely age of onset of DCM based on an individual’s genetic architecture.8 The genetic architecture of DCM is likely a combination of common low-risk variants (polygenic risk) and rare high-risk (monogenic, familial) variants9. Incorporating the interaction between polygenic risk and rare variants is desirable but requires further investigation within well designed studies.
Implementation of a dilated cardiomyopathy polygenic risk score can increase the diagnostic yield by X% with ‘polygenic’ cases of DCM. This has important implications for family screening and ongoing surveillance of individuals with gene-elusive DCM.
The code used to generate these analyses and figures is available on GitHub. UK Biobank data is available on application.
…
# calculate the odds ratio of DCM by PGS quantile
dat2 <- copy(dat)
n_quantiles <- 10
quantiles <- c(0, 1, 5, 10, 20, 40, 60, 80, 90, 95, 99, 100)
dat2[, pgs_quantile := cut(pgs,
breaks = quantile(pgs, probs = quantiles/100, na.rm = TRUE),
labels = c(paste0("<",quantiles[2]),
paste0(quantiles[2:(length(quantiles)-2)], "-", quantiles[3:(length(quantiles)-1)]),
paste0(">", quantiles[length(quantiles)-1])),
include.lowest = TRUE), by="age_cat"]
dat2[, pgs_percentile := cut(pgs,
breaks = quantile(pgs, probs = seq(0,1,0.01), na.rm = TRUE),
labels = seq(1,100,1),
include.lowest = TRUE), by="age_cat"]
or_table2 <- expand.grid(pgs_quantile = unique(dat2$pgs_quantile),
age_cat = unique(dat2$age_cat)) |> as.data.table()
continuity_correction <- 0.5
or_table2[, `:=`(
dcm_cases_in_group = dat2[pgs_quantile == .BY[[1]] & age_cat == .BY[[2]] & dcm_status != "No DCM", .N + continuity_correction],
dcm_controls_in_group = dat2[pgs_quantile == .BY[[1]] & dcm_status == "No DCM", .N + continuity_correction],
dcm_cases_rest = dat2[pgs_quantile != .BY[[1]] & age_cat == .BY[[2]] & dcm_status != "No DCM", .N + continuity_correction],
dcm_controls_rest = dat2[pgs_quantile != .BY[[1]] & dcm_status == "No DCM", .N + continuity_correction]
), by = .(pgs_quantile, age_cat)]
or_table2[, `:=`(
odds_in_group = dcm_cases_in_group / dcm_controls_in_group,
odds_in_rest = dcm_cases_rest / dcm_controls_rest
)]
or_table2[, `:=`(odds_ratio = odds_in_group / odds_in_rest,
se_log_or = sqrt(1 / dcm_cases_in_group +
1 / dcm_controls_in_group +
1 / dcm_cases_rest +
1 / dcm_controls_rest))]
or_table2[, `:=`(log_or = log(odds_ratio))]
or_table2[, `:=`(ci_lower = exp(log_or - 1.96 * se_log_or),
ci_upper = exp(log_or + 1.96 * se_log_or))]
# model the odds ratio across PGS quantile and extract the thresholds of 5 and 1.5
or_table2[, pgs_midpoint := as.numeric(sub("-.*", "", as.character(pgs_quantile))) + (as.numeric(sub(".*-", "", as.character(pgs_quantile))) - as.numeric(sub("-.*", "", as.character(pgs_quantile)))) / 2]
or_table2[as.character(pgs_quantile) == "<1", pgs_midpoint := 0.5]
or_table2[as.character(pgs_quantile) == ">99", pgs_midpoint := 99.5]
spline_models <- or_table2[, .(
model = list(lm(log(odds_ratio) ~ ns(pgs_midpoint, df = 6), data = .SD))
), by = age_cat]
pred_pgs_values <- seq(0,100,0.01)
predicted_or <- spline_models[, .(
pgs_quantile_val = pred_pgs_values,
pred_or = exp(predict(model[[1]], newdata = data.frame(pgs_midpoint = pred_pgs_values)))
), by = age_cat]
# predict the thresholds from the fit
pgs_thresholds <- data.table(
or_thresh = rep(c(1.5, 5.0), length(unique(predicted_or$age_cat))),
thresh_lab = rep(c("intermediate", "pathogenic"), length(unique(predicted_or$age_cat))),
age_cat = rep(unique(predicted_or$age_cat), 2)
)
pgs_thresholds[, pgs_cutoff := mapply(function(thresh, age) predicted_or[age_cat==age][which.min(abs(pred_or-thresh)), pgs_quantile_val], or_thresh, age_cat)]
foo <- ggplot(or_table2[age_cat!="0-19"], aes(x = pgs_midpoint, y = odds_ratio)) +
geom_hline(yintercept = 1.0, color="darkgray", linetype = "dotted") +
#geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width=0.3, color="darkgray") +
geom_point(size=4) +
geom_line(data = predicted_or[age_cat!="0-19"],
aes(x = pgs_quantile_val, y = pred_or, group=1), color = "darkred", inherit.aes = FALSE) +
geom_segment(data = pgs_thresholds[age_cat!="0-19"], aes(x=0, xend = pgs_cutoff, y = or_thresh, yend = or_thresh, color = thresh_lab), inherit.aes = F, linetype = "dashed") +
geom_segment(data = pgs_thresholds[age_cat!="0-19"], aes(y=0, yend = or_thresh, x =pgs_cutoff, xend=pgs_cutoff, color = thresh_lab), inherit.aes = F, linetype = "dashed") +
geom_text(data = pgs_thresholds[age_cat!="0-19"], aes(x=0, y = or_thresh, label = paste0(thresh_lab, " OR=",or_thresh, " (", sprintf("%.0f%%", pgs_cutoff), ")"), color = thresh_lab), vjust=-1, hjust=0, inherit.aes = F, show.legend = FALSE) +
theme_classic(base_size = 22) +
labs(x = "PGS quantile",
y = "OR for DCM") +
guides(color = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1, vjust=1)) +
facet_wrap(~age_cat, scales="free")
foo
png(file.path(dir, "figures", "foo.png"), width=1000, height=700)
foo
invisible(dev.off())